Importing Libraries¶

In [ ]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from xgboost import XGBRegressor

# ensure openpyxl==3.1.0, version 3.1.1 has a bug

Data Preparation¶

Downloading Data¶

In [ ]:
MajorRoadsEmissionsURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/8c520de2-c518-4e64-932c-0071ac826742/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
SupportingDocURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/1f3755f3-dae8-4d39-b9b4-0cc7adb4e826/1.%20Supporting%20Information.zip'
LocalFileMajorRoadsEmissions = 'data/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
LocalFileSupportingDoc = 'data/1. Supporting Information.zip'
In [ ]:
# Check if data folder exists, if not create it
if not os.path.exists('data'):
    os.makedirs('data')

# Check if data is already downloaded, if not set load_from_local to False
if os.path.exists(LocalFileMajorRoadsEmissions):
    load_from_local = True
else:
    load_from_local = False

if load_from_local:
    MajorRoadsEmissionsURL = LocalFileMajorRoadsEmissions 
xls = pd.ExcelFile(MajorRoadsEmissionsURL)
In [ ]:
if load_from_local:
    DocZip = ZipFile(LocalFileSupportingDoc)
else:
    resp = urlopen(SupportingDocURL)
    DocZip = ZipFile(BytesIO(resp.read()))
DocZip.namelist()[:10]
Out[ ]:
['1. Supporting Information/',
 '1. Supporting Information/1. Road Traffic Data/',
 '1. Supporting Information/1. Road Traffic Data/Excel/',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2008_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2010_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2020_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2025_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2030_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_VKM_AllYears_Summary.xlsx']

Load Data into dataframes¶

Road Emissions Data¶

In [ ]:
xls.sheet_names
Out[ ]:
['2013 LTS Rds', '2013 Other Major Rds']
In [ ]:
df_LTSroads = pd.read_excel(xls, '2013 LTS Rds')
print(df_LTSroads.shape)
df_LTSroads.head()
(366220, 32)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut Lts Length (m) Emissions Year Pollutant ... Artic5Axle Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv
0 6253 4000000027908919 24 External NonGLA 18898 50.761449 DFT 2013 CO2 ... 0.241372 0.190560 8.761443 4.810774 0.037550 1.735121 0.0 0.0 0.0 0.0
1 6253 4000000027947931 24 External NonGLA 18895 28.592125 DFT 2013 CO2 ... 0.000000 0.000000 0.015535 0.008576 0.000000 0.000000 0.0 0.0 0.0 0.0
2 6253 4000000028013383 24 External NonGLA 15816 5.101391 DFT 2013 CO2 ... 0.027271 0.021509 0.939028 0.518684 0.004055 0.184415 0.0 0.0 0.0 0.0
3 6253 4000000028025820 24 External NonGLA 15816 3.757501 DFT 2013 CO2 ... 0.020087 0.015843 0.691654 0.382044 0.002987 0.135834 0.0 0.0 0.0 0.0
4 6253 4000000028029388 24 External NonGLA 15816 1.624593 DFT 2013 CO2 ... 0.008685 0.006850 0.299044 0.165180 0.001292 0.058729 0.0 0.0 0.0 0.0

5 rows × 32 columns

In [ ]:
df_OtherMajorRoads = pd.read_excel(xls, '2013 Other Major Rds')
print(df_OtherMajorRoads.shape)
df_OtherMajorRoads.head()
(513740, 32)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut DotRef Length (m) Emissions Year Pollutant ... Artic5Axle Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv
0 5911 4000000027989878 2 External NonGLA 28440 9.714495 DFT 2013 CO2 ... 3.006694 12.549219 18.791658 19.630267 0.279151 11.005820 0.000000 0.744254 0.0 0.0
1 5911 4000000027989880 2 External NonGLA 28440 0.000000 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
2 5911 4000000027989882 2 External NonGLA 57226 8.577192 DFT 2013 CO2 ... 0.760333 2.446611 19.478135 10.300493 0.120149 7.734197 0.754408 0.868990 0.0 0.0
3 5911 4000000028014332 2 External NonGLA 57226 9.347936 DFT 2013 CO2 ... 0.823130 2.648621 20.173154 10.553940 0.123945 7.418739 0.820669 0.897038 0.0 0.0
4 5911 4000000027888882 2 External NonGLA 28440 0.000000 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0

5 rows × 32 columns

In [ ]:
# concat  the two dataframes
df_RoadEmissions = pd.concat([df_LTSroads, df_OtherMajorRoads], ignore_index=True)
print(df_RoadEmissions.shape)
df_RoadEmissions.head()
(879960, 33)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut Lts Length (m) Emissions Year Pollutant ... Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv DotRef
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 0.190560 8.761443 4.810774 0.037550 1.735121 0.0 0.0 0.0 0.0 NaN
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.000000 0.015535 0.008576 0.000000 0.000000 0.0 0.0 0.0 0.0 NaN
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 0.021509 0.939028 0.518684 0.004055 0.184415 0.0 0.0 0.0 0.0 NaN
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 0.015843 0.691654 0.382044 0.002987 0.135834 0.0 0.0 0.0 0.0 NaN
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 0.006850 0.299044 0.165180 0.001292 0.058729 0.0 0.0 0.0 0.0 NaN

5 rows × 33 columns

Traffic Data¶

In [ ]:
traffic_excel = DocZip.read('1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx')
traffic_xls = pd.ExcelFile(BytesIO(traffic_excel))
In [ ]:
df_AADT = pd.read_excel(traffic_xls, 'MajorGrid_AADTandVKM_2013')
print(df_AADT.shape)
df_AADT.head()
(87999, 44)
Out[ ]:
RowID Year Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut TLRN MotorwayNumber AADT Motorcycle AADT Taxi ... VKM_Coach VKM_Rigid2Axle VKM_Rigid3Axle VKM_Rigid4Axle VKM_Artic3Axle VKM_Artic5Axle VKM_Artic6Axle VKM_ElectricCar VKM_ElectricLgv VKM_TOTAL
0 1.0 2013.0 4.000000e+15 836.0 Outer Hillingdon Other Other 88.301916 77.112580 ... 149.248696 293.680300 55.978941 39.030966 16.191367 10.970609 3.993946 4.335614 1.235289 16605.011414
1 2.0 2013.0 4.000000e+15 2217.0 Outer Hillingdon Other Other 88.301916 77.112580 ... 98.338925 193.503902 36.884134 25.717231 10.668379 7.228458 2.631583 2.856706 0.813924 10940.494996
2 3.0 2013.0 4.000000e+15 282.0 External NonGLA Other Other 310.363572 100.322495 ... 1657.075319 12950.212101 3011.364039 2861.551314 1710.809301 1966.897025 1647.110606 221.806380 47.635028 796735.125068
3 4.0 2013.0 4.000000e+15 873.0 Outer Hillingdon Other Other 39.473081 144.548284 ... 118.008843 9777.985094 2051.227418 1024.275647 470.758531 815.631678 1959.389833 78.775616 15.287825 284144.265992
4 5.0 2013.0 4.000000e+15 2930.0 Outer Hillingdon Other Other 39.473081 144.548284 ... 401.216526 33244.027352 6973.937855 3482.419671 1600.524988 2773.054115 6661.700602 267.828056 51.976850 966057.900401

5 rows × 44 columns

Join the dataframes¶

In [ ]:
# left join df_RoadEmissions and df_AADT on the ['Toid' and 'GRID_ExactCut_ID'] column
df = pd.merge(df_RoadEmissions, df_AADT, how='left',
            left_on=['Toid', 'GRID_ExactCut_ID'],
            right_on=['Toid', 'GRID_ExactCut_ID'])
print(df.shape)
df.head()
(879960, 75)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut_x BoroughName_ExactCut_x Lts Length (m)_x Emissions Year_x Pollutant ... VKM_Coach VKM_Rigid2Axle VKM_Rigid3Axle VKM_Rigid4Axle VKM_Artic3Axle VKM_Artic5Axle VKM_Artic6Axle VKM_ElectricCar VKM_ElectricLgv VKM_TOTAL
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 0.0 2383.880434 229.558857 335.509098 194.242109 247.217230 176.583736 28.990862 5.460174 104036.993985
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.046589 0.000000 140.050860
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 0.0 239.573680 23.070058 33.717777 19.520818 24.844678 17.746199 2.913505 0.548733 10455.442789
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 0.0 176.461358 16.992575 24.835302 14.378333 18.299696 13.071212 2.145983 0.404177 7701.103189
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 0.0 76.294807 7.346907 10.737788 6.216614 7.912054 5.651467 0.927837 0.174750 3329.647849

5 rows × 75 columns

In [ ]:
# remove unnecessary columns
columnstokeep = df.columns
for a in columnstokeep:
    if 'VKM' in a:
        columnstokeep = columnstokeep.drop(a)
df = df[columnstokeep]
print(df.shape)
df.head()
(879960, 58)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut_x BoroughName_ExactCut_x Lts Length (m)_x Emissions Year_x Pollutant ... AADT Rigid3Axle AADT Rigid4Axle AADT Artic3Axle AADT Artic5Axle AADT Artic6Axle AADT ElectricCar AADT ElectricLgv AADT TOTAL Speed (kph) Length (m)_y
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 12.389882 18.108290 10.483747 13.342950 9.530679 1.564711 0.29470 5615.144325 42.269546 50.761449
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.004464 0.00000 13.419814 32.236053 28.592125
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 10.202783
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 7.515003
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 3.249186

5 rows × 58 columns

In [ ]:
df.columns
Out[ ]:
Index(['GridId', 'Toid', 'GRID_ExactCut_ID', 'Location_ExactCut_x',
       'BoroughName_ExactCut_x', 'Lts', 'Length (m)_x', 'Emissions', 'Year_x',
       'Pollutant', 'Emissions Unit', 'Motorcycle', 'Taxi', 'Car',
       'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
       'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
       'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
       'ElectricLgv', 'DotRef', 'RowID', 'Year_y', 'Location_ExactCut_y',
       'BoroughName_ExactCut_y', 'TLRN', 'MotorwayNumber', 'AADT Motorcycle',
       'AADT Taxi', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv',
       'AADT LtBus', 'AADT Coach', 'AADT Rigid2Axle', 'AADT Rigid3Axle',
       'AADT Rigid4Axle', 'AADT Artic3Axle', 'AADT Artic5Axle',
       'AADT Artic6Axle', 'AADT ElectricCar', 'AADT ElectricLgv', 'AADT TOTAL',
       'Speed (kph)', 'Length (m)_y'],
      dtype='object')

Classify columns to labels and targets¶

In [ ]:
label = []
columnstokeep = df.columns
for a in columnstokeep:
    if 'AADT' in a:
        label = label + [a]
    elif 'Speed' in a:
        label = label + [a]
    elif 'Length' in a:
        label = label + [a]
label
Out[ ]:
['Length (m)_x',
 'AADT Motorcycle',
 'AADT Taxi',
 'AADT Pcar',
 'AADT Dcar',
 'AADT PLgv',
 'AADT DLgv',
 'AADT LtBus',
 'AADT Coach',
 'AADT Rigid2Axle',
 'AADT Rigid3Axle',
 'AADT Rigid4Axle',
 'AADT Artic3Axle',
 'AADT Artic5Axle',
 'AADT Artic6Axle',
 'AADT ElectricCar',
 'AADT ElectricLgv',
 'AADT TOTAL',
 'Speed (kph)',
 'Length (m)_y']
In [ ]:
# remove irrelevant columns
label.remove('AADT TOTAL')
label.remove('Speed (kph)')
label.remove('Length (m)_y')
label
Out[ ]:
['Length (m)_x',
 'AADT Motorcycle',
 'AADT Taxi',
 'AADT Pcar',
 'AADT Dcar',
 'AADT PLgv',
 'AADT DLgv',
 'AADT LtBus',
 'AADT Coach',
 'AADT Rigid2Axle',
 'AADT Rigid3Axle',
 'AADT Rigid4Axle',
 'AADT Artic3Axle',
 'AADT Artic5Axle',
 'AADT Artic6Axle',
 'AADT ElectricCar',
 'AADT ElectricLgv']
In [ ]:
target = ['Motorcycle', 'Taxi', 'Car',
          'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
          'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
          'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
          'ElectricLgv']

Visualize the data¶

In [ ]:
# save plot to visuals folder
if not os.path.exists('visuals'):
    os.makedirs('visuals')

# plot the correlation matrix
corr = df[label + target].corr()
plt.figure(figsize=(30, 30))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')

plt.savefig('visuals/main_correlation_matrix.png')

Derive the target¶

In [ ]:
# sum the target variables to one column
df['Total Emissions'] = df[target].sum(axis=1)

Split data for each type of pollutants¶

In [ ]:
# Split df by unique 'Pollutant' values into dictionary
df_dict = {k: v for k, v in df.groupby('Pollutant')}
df_dict.keys()
Out[ ]:
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])
In [ ]:
for k, v in df_dict.items():
    print(k, v.shape)
CO2 (87996, 59)
NOx (87996, 59)
PM10_Brake (87996, 59)
PM10_Exhaust (87996, 59)
PM10_Resusp (87996, 59)
PM10_Tyre (87996, 59)
PM25_Brake (87996, 59)
PM25_Exhaust (87996, 59)
PM25_Resusp (87996, 59)
PM25_Tyre (87996, 59)

Split the data into train and test¶

In [ ]:
from sklearn.model_selection import train_test_split

train_test_set = {}

for k, v in df_dict.items():
    x_train, x_test, y_train, y_test = train_test_split(v[label], v['Total Emissions'], test_size=0.2, random_state=42)
    # drop indexes
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    train_test_set[k] = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}

train_test_set.keys()
Out[ ]:
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])

Feature Selection¶

In [ ]:
# feature selection
for k, v in train_test_set.items():
    selector = SelectKBest(f_regression, k=10)
    selector.fit(v['x_train'], v['y_train'])
    v['x_train'] = selector.transform(v['x_train'])
    v['x_test'] = selector.transform(v['x_test']) 
    v['selected_features'] = [label[i] for i in selector.get_support(indices=True)]
    v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
    v['selector'] = selector
    print(k, v['selected_features'])
    
CO2 ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
NOx ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
In [ ]:
# create a table using dataframe for the selected features
df_selected_features = pd.DataFrame(columns=['Pollutant', 'Feature', 'Score', 'Pvalue'])
for k, v in train_test_set.items():
    for i in range(len(v['selected_features'])):
        # using concat to append a row to the dataframe
        df_selected_features = pd.concat([df_selected_features,
                                            pd.DataFrame([[k, v['selected_features'][i],
                                                              v['selector'].scores_[i],
                                                                v['selector'].pvalues_[i]]],
                                                            columns=['Pollutant', 'Feature', 'Score', 'Pvalue'])],
                                            ignore_index=True)
df_selected_features = df_selected_features.sort_values(by=['Pollutant', 'Score'], ascending=False)
df_selected_features.head()
Out[ ]:
Pollutant Feature Score Pvalue
94 PM25_Tyre AADT DLgv 44730.395889 0.0
95 PM25_Tyre AADT Rigid2Axle 43381.929381 0.0
96 PM25_Tyre AADT Rigid3Axle 30302.210535 0.0
90 PM25_Tyre Length (m)_x 27354.984412 0.0
93 PM25_Tyre AADT PLgv 24193.121504 0.0
In [ ]:
# create output folder if it does not exist
if not os.path.exists('features'):
    os.makedirs('features')

# save df_selected_features to csv in features folder
df_selected_features.to_csv('features/df_selected_features.csv', index=False)
In [ ]:
# create output folder if it does not exist
if not os.path.exists('output'):
    os.makedirs('output')

# output the training and testing sets to csv files in the output folder, with the target on the left
for k, v in train_test_set.items():
    df_train_output = pd.concat([v['y_train'], v['x_train']], axis=1)
    df_test_output = pd.concat([v['y_test'], v['x_test']], axis=1)
    df_train_output.to_csv('output/' + k + '_train.csv', index=False)
    df_test_output.to_csv('output/' + k + '_test.csv', index=False)
In [ ]:
# visualize correlation matrix of selected features
for k, v in train_test_set.items():
    corr = pd.DataFrame(v['x_train'], columns=v['selected_features']).corr()
    # set title of plt to k
    print(k)
    plt.figure(figsize=(10,10))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.savefig('visuals/' + k + '_correlation_matrix.png')
CO2
NOx
PM10_Brake
PM10_Exhaust
PM10_Resusp
PM10_Tyre
PM25_Brake
PM25_Exhaust
PM25_Resusp
PM25_Tyre

Model Development¶

Import data if already saved¶

In [ ]:
# if train_test_set is not initialized, load the csv files in the output folder
try:
    train_test_set
except NameError:
    train_test_set = {}
    for file in os.listdir('output'):
        if file.endswith('_train.csv'):
            pollutant = file[:file.rfind('_')]
            df_train = pd.read_csv('output/' + file)
            df_test = pd.read_csv('output/' + pollutant + '_test.csv')
            train_test_set[pollutant] = {'x_train': df_train.iloc[:, 1:],
                                         'x_test': df_test.iloc[:, 1:],
                                         'y_train': df_train.iloc[:, 0],
                                         'y_test': df_test.iloc[:, 0]}
            train_test_set[pollutant]['selected_features'] = list(train_test_set[pollutant]['x_train'].columns)

Standardize the data¶

In [ ]:
# stardaize the x_train and x_test
for k, v in train_test_set.items():
    # initialize the scaler
    scaler = StandardScaler()
    # fit the scaler to the x_train
    scaler.fit(v['x_train'])
    # transform the x_train and x_test
    v['x_train'] = scaler.transform(v['x_train'])
    v['x_test'] = scaler.transform(v['x_test'])
    v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
    # save the scaler
    v['scaler'] = scaler

Model Training¶

In [ ]:
model_dict = {}
In [ ]:
def train_eval_model(model, x_train, y_train, x_test, y_test):
    # train the model
    model.fit(x_train, y_train)
    # predict the y_test
    y_pred = model.predict(x_test)
    # evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return {'model': model, 'mse': mse, 'rmse': rmse, 'r2': r2}
In [ ]:
def hyperparameter_tuning(model, param_grid, pollutant, model_dict, x_train, y_train, x_test, y_test):
    # initialize the grid search
    grid_search = GridSearchCV(model, param_grid, cv=5)
    # fit the grid search to the training data
    grid_search.fit(x_train, y_train)
    # save the best model
    model_dict[pollutant]['best_model'] = grid_search.best_estimator_
    # predict the y_test
    y_pred = grid_search.best_estimator_.predict(x_test)
    # evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    model_dict[pollutant]['best_mse'] = mse
    model_dict[pollutant]['best_rmse'] = rmse
    model_dict[pollutant]['best_r2'] = r2
    model_dict[pollutant]['best_params'] = grid_search.best_params_
    model_dict[pollutant]['best_score'] = grid_search.best_score_
    return model_dict

Linear Regression¶

In [ ]:
# model dict for each pollutant
LR_dict = {}

for k, v in train_test_set.items():
    LR_dict[k] = train_eval_model(
        LinearRegression(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, LR_dict[k]['mse'], LR_dict[k]['r2'], LR_dict[k]['rmse'])
CO2 156343.23707624507 0.6228061024706681 395.40262654191497
NOx 1.3337240174376739 0.5887914721059816 1.1548696971683317
PM10_Brake 0.0020649961618088935 0.5819056209536458 0.04544222883848121
PM10_Exhaust 0.0008067369355814341 0.5964592889324113 0.02840311489223381
PM10_Resusp 0.007830744110917233 0.5904504796613461 0.08849149174308925
PM10_Tyre 0.0002890044223154252 0.650662925495141 0.017000130067603165
PM25_Brake 0.00035156196710148816 0.5665416951804979 0.018749985789367634
PM25_Exhaust 0.0005769161710396388 0.5835788811077669 0.024019079312905374
PM25_Resusp 1.0696322799024043e-05 0.5908667099465899 0.0032705233218896395
PM25_Tyre 0.000154826853665266 0.6211541868187213 0.01244294393080938
In [ ]:
# add LR_dict to model_dict
model_dict['LR'] = LR_dict

SVM Regression¶

In [ ]:
# model dict for each pollutant
SVR_dict = {}

for k, v in train_test_set.items():
    SVR_dict[k] = train_eval_model(SVR(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, SVR_dict[k]['mse'], SVR_dict[k]['r2'], SVR_dict[k]['rmse'])
CO2 205750.76003393464 0.5036054481910563 453.59757498683194
NOx 1.522708912356757 0.5305243947962537 1.2339809205805239
PM10_Brake 0.003126316147087352 0.36702293573558564 0.055913470175686215
PM10_Exhaust 0.004618347885348237 -1.310159988234227 0.06795842762563181
PM10_Resusp 0.010696433148599308 0.4405743562433685 0.10342356186381954
PM10_Tyre 0.005143767056453705 -5.217581450967644 0.07172006034892682
PM25_Brake 0.0008522858713561948 -0.050825810497931556 0.029193935523601385
In [ ]:
model_dict['SVR'] = SVR_dict

Decision Tree Regression¶

In [ ]:
# model dict for each pollutant
DTR_dict = {}

for k, v in train_test_set.items():
    DTR_dict[k] = train_eval_model(DecisionTreeRegressor(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, DTR_dict[k]['mse'], DTR_dict[k]['r2'], DTR_dict[k]['rmse'])
CO2 10929.241735276239 0.9736320971455986 104.54301380425302
NOx 0.15603487488324774 0.9518919428892522 0.39501249965443846
PM10_Brake 0.0015277829007523578 0.6906737867017841 0.039086863531784664
PM10_Exhaust 5.574784489590433e-05 0.9721141750457166 0.007466447943694801
PM10_Resusp 0.00029638916872755665 0.984498785790138 0.017215956805462677
PM10_Tyre 1.63434706054287e-05 0.9802446614386915 0.0040427058519546906
PM25_Brake 0.0001740118539670192 0.7854520957971861 0.013191355274080795
PM25_Exhaust 4.200376590464508e-05 0.969681461408545 0.0064810312377464345
PM25_Resusp 5.013962371571591e-07 0.9808216434766511 0.0007080933816645649
PM25_Tyre 8.485112877972075e-06 0.9792377781238125 0.00291292170817756

DT Visualisation¶

In [ ]:
for k, v in DTR_set.items():
    fig, ax = plt.subplots(figsize=(120, 12))
    plot_tree(v['model'],
            ax=ax,
            feature_names=v['x_train'].columns,
            fontsize=12,
            filled=True)
    plt.savefig('visuals/' + k + '_DTR.png')

Hyperparameter Tuning¶

In [ ]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None]}
for k, v in train_test_set.items():
    DTR_dict = hyperparameter_tuning(DecisionTreeRegressor(), param_grid, k, DTR_dict, v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, DTR_dict[k]['best_params'], DTR_dict[k]['best_score'], DTR_dict[k]['best_mse'], DTR_dict[k]['best_r2'], DTR_dict[k]['best_rmse'])
CO2 {'max_depth': 11} 0.9693905771178665 10025.28718149829 0.9758129790801501 100.12635607819897
NOx {'max_depth': 11} 0.9529703330773728 0.15228318467905627 0.9530486492136564 0.3902347814829635
PM10_Brake {'max_depth': 9} 0.7767798587673086 0.000969543140267805 0.8036991328672336 0.031137487700002468
PM10_Exhaust {'max_depth': 11} 0.9704734523661248 6.228027933376792e-05 0.9688465631120223 0.007891785560553956
PM10_Resusp {'max_depth': 11} 0.9789643667238573 0.00027970667933761894 0.9853712834009561 0.016724433602894266
PM10_Tyre {'max_depth': None} 0.9726982625556234 1.7479963339835284e-05 0.9788709141311146 0.004180904607837314
PM25_Brake {'max_depth': 9} 0.7848551706908675 0.00015020658685994454 0.8148027983520767 0.01225587968527533
PM25_Exhaust {'max_depth': 9} 0.9711781899903531 4.5595462543963925e-05 0.9670889559314124 0.006752441228471665
PM25_Resusp {'max_depth': None} 0.9770537730977109 4.436477558612973e-07 0.9830305171795202 0.0006660688822196225
PM25_Tyre {'max_depth': 12} 0.9722327091955691 8.635630736521929e-06 0.9788694759903602 0.0029386443705426367
In [ ]:
model_dict['DTR'] = DTR_dict

Random Forest Regression¶

In [ ]:
RFR_dict = {}

for k, v in train_test_set.items():
    RFR_dict[k] = train_eval_model(RandomForestRegressor(random_state=42), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, RFR_dict[k]['mse'], RFR_dict[k]['r2'], RFR_dict[k]['rmse'])
CO2 5182.449846763348 0.9874968147455099 71.98923424209588
NOx 0.07964791419497166 0.9754432692838015 0.2822196204996592
PM10_Brake 0.0006345126966994198 0.8715318716664411 0.02518953546017512
PM10_Exhaust 2.9831714937173756e-05 0.98507777327756 0.005461841716598327
PM10_Resusp 0.00012846486411468864 0.9932812613037385 0.01133423416533683
PM10_Tyre 1.0139446926080236e-05 0.9877438390116112 0.0031842498215561284
PM25_Brake 0.00010635037179891613 0.8688753159024897 0.010312631662137271
PM25_Exhaust 2.6157647972381708e-05 0.9811192724644566 0.005114454806954668
PM25_Resusp 2.0064062064047253e-07 0.9923255160877825 0.00044792925852245075
PM25_Tyre 4.621304177130512e-06 0.9886921312582624 0.0021497218836701907

Hyperparameter Tuning¶

In [ ]:
param_grid = {'n_estimators': [120, 150, 200]}
for k, v in train_test_set.items():
    RFR_dict = hyperparameter_tuning(RandomForestRegressor(random_state=42), param_grid, k, RFR_dict, v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, RFR_dict[k]['best_params'], RFR_dict[k]['best_mse'], RFR_dict[k]['best_r2'], RFR_dict[k]['best_rmse'])
CO2 {'n_estimators': 200} 5181.916028565403 0.98749810263599 71.98552652141542
NOx {'n_estimators': 200} 0.07917984362339837 0.9755875829560235 0.2813891320278706
PM10_Brake {'n_estimators': 120} 0.0006479354559862355 0.86881420065431 0.02545457632698363
PM10_Exhaust {'n_estimators': 150} 3.0021769435629244e-05 0.9849827054505327 0.005479212483161175
PM10_Resusp {'n_estimators': 150} 0.00012748472250863533 0.9933325229104182 0.011290913271681585
PM10_Tyre {'n_estimators': 200} 9.870728996230191e-06 0.9880686545792372 0.0031417716333671025
PM25_Brake {'n_estimators': 200} 0.00010477083756667516 0.8708228025329586 0.010235762676355639
PM25_Exhaust {'n_estimators': 200} 2.6032444649002596e-05 0.9812096448801173 0.005102199981282838
PM25_Resusp {'n_estimators': 200} 1.982879971088954e-07 0.9924155037053795 0.0004452954043204302
PM25_Tyre {'n_estimators': 200} 4.686965799321414e-06 0.9885314638413071 0.0021649401375838117
In [ ]:
model_dict['RFR'] = RFR_dict

GradientBoosting Regression¶

In [ ]:
GBR_dict = {}

for k, v in train_test_set.items():
    GBR_dict[k] = train_eval_model(GradientBoostingRegressor(random_state=42), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, GBR_dict[k]['mse'], GBR_dict[k]['r2'], GBR_dict[k]['rmse'])
CO2 8633.964127967609 0.9791696868923833 92.9191268144918
NOx 0.12515931403230093 0.9614113740154997 0.3537786229159429
PM10_Brake 0.0007463079898305557 0.848896970679011 0.02731863814011518
PM10_Exhaust 5.2317562364365925e-05 0.9738300486978162 0.007233088024099107
PM10_Resusp 0.00028458823959737835 0.9851159767998757 0.016869743317471618
PM10_Tyre 1.6312103470141583e-05 0.9802825767867993 0.004038824515888452
PM25_Brake 0.00012991753206301817 0.8398181871643908 0.011398137218994082
PM25_Exhaust 3.974716875618719e-05 0.9713102850689331 0.006304535570221425
PM25_Resusp 3.9775022749421016e-07 0.9847860931039734 0.0006306744227366528
PM25_Tyre 8.118895350512688e-06 0.9801338757561492 0.0028493675351756024

Hyperparameter Tuning¶

In [ ]:
# grid search for best parameters for GradientBoostingRegressor
param_grid = {'n_estimators': [120, 150, 200],
                'max_depth': [4, 6]}
for k, v in train_test_set.items():
    GBR_dict = hyperparameter_tuning(GradientBoostingRegressor(random_state=42),
                                    param_grid,
                                    pollutant=k,
                                    model_dict=GBR_dict,
                                    x_train=v['x_train'],
                                    y_train=v['y_train'],
                                    x_test=v['x_test'],
                                    y_test=v['y_test'])
    print(k, GBR_dict[k]['best_params'], GBR_dict[k]['best_mse'], GBR_dict[k]['best_r2'], GBR_dict[k]['best_rmse'])
In [ ]:
model_dict['GBR'] = GBR_dict

XGBoost Regression¶

In [ ]:
XGB_dict = {}

for k, v in train_test_set.items():
    XGB_dict[k] = train_eval_model(XGBRegressor(random_state=42), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, XGB_dict[k]['mse'], XGB_dict[k]['r2'], XGB_dict[k]['rmse'])
CO2 4248.047338888423 0.9897511554537981 65.1770461043489
NOx 0.07502231582260359 0.9768694155272984 0.27390201865375796
PM10_Brake 0.0006517468906207694 0.8680425094394434 0.025529333924346115
PM10_Exhaust 2.649124024145316e-05 0.9867487238372276 0.0051469641772071
PM10_Resusp 0.00012123851173245462 0.9936592010129184 0.011010836105058264
PM10_Tyre 8.80976727638107e-06 0.989351102994401 0.002968125212382569
PM25_Brake 0.00010614136977098437 0.8691330049394411 0.010302493376410603
PM25_Exhaust 2.2290516379026105e-05 0.9839105883364085 0.004721283340261004
PM25_Resusp 2.3633240852221214e-07 0.9909603087283639 0.000486140317729575
PM25_Tyre 4.43777905787662e-06 0.9891411988547224 0.0021066036784066953

Hyperparameter Tuning¶

In [ ]:
# grid search for best parameters for XGBRegressor
param_grid = {'n_estimators': [120, 150, 200],
                'max_depth': [6, 8]}
for k, v in train_test_set.items():
    XGB_dict = hyperparameter_tuning(XGBRegressor(random_state=42),
                                        param_grid,
                                        pollutant=k,
                                        model_dict=XGB_dict,
                                        x_train=v['x_train'],
                                        y_train=v['y_train'],
                                        x_test=v['x_test'],
                                        y_test=v['y_test'])
    print(k, XGB_dict[k]['best_params'], XGB_dict[k]['best_mse'], XGB_dict[k]['best_r2'], XGB_dict[k]['best_rmse'])
CO2 {'max_depth': 6, 'n_estimators': 200} 3726.5828714324016 0.9910092413075973 61.045744089431835
NOx {'max_depth': 8, 'n_estimators': 200} 0.06234448170662684 0.9807781953314824 0.24968876968463527
PM10_Brake {'max_depth': 6, 'n_estimators': 120} 0.0006546082734722818 0.8674631727275199 0.025585313628569844
PM10_Exhaust {'max_depth': 6, 'n_estimators': 200} 2.433226087529787e-05 0.9878286744756233 0.004932774156121266
PM10_Resusp {'max_depth': 6, 'n_estimators': 200} 0.00010098858053788854 0.9947182765605491 0.0100493074655863
PM10_Tyre {'max_depth': 6, 'n_estimators': 200} 7.934530900636037e-06 0.9904090539854391 0.0028168299381815787
PM25_Brake {'max_depth': 6, 'n_estimators': 120} 0.00010611452254087318 0.869166106230087 0.010301190345822817
PM25_Exhaust {'max_depth': 6, 'n_estimators': 200} 2.0287439389856145e-05 0.9853564198157971 0.004504158011199889
PM25_Resusp {'max_depth': 8, 'n_estimators': 120} 2.227621662549712e-07 0.9914793691540763 0.0004719768704660973
PM25_Tyre {'max_depth': 6, 'n_estimators': 200} 3.982623367291088e-06 0.9902549192697659 0.001995651113619585
In [ ]:
model_dict['XGB'] = XGB_dict
dict_items([('CO2', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 4248.047338888423, 'rmse': 65.1770461043489, 'r2': 0.9897511554537981, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 3726.5828714324016, 'best_rmse': 61.045744089431835, 'best_r2': 0.9910092413075973, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9876430626771558}), ('NOx', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 0.07502231582260359, 'rmse': 0.27390201865375796, 'r2': 0.9768694155272984, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 0.06234448170662684, 'best_rmse': 0.24968876968463527, 'best_r2': 0.9807781953314824, 'best_params': {'max_depth': 8, 'n_estimators': 200}, 'best_score': 0.9781044540566057}), ('PM10_Brake', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 0.0006517468906207694, 'rmse': 0.025529333924346115, 'r2': 0.8680425094394434, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=120, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 0.0006546082734722818, 'best_rmse': 0.025585313628569844, 'best_r2': 0.8674631727275199, 'best_params': {'max_depth': 6, 'n_estimators': 120}, 'best_score': 0.841849838341607}), ('PM10_Exhaust', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 2.649124024145316e-05, 'rmse': 0.0051469641772071, 'r2': 0.9867487238372276, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 2.433226087529787e-05, 'best_rmse': 0.004932774156121266, 'best_r2': 0.9878286744756233, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9868177732475024}), ('PM10_Resusp', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 0.00012123851173245462, 'rmse': 0.011010836105058264, 'r2': 0.9936592010129184, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 0.00010098858053788854, 'best_rmse': 0.0100493074655863, 'best_r2': 0.9947182765605491, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9921829846268164}), ('PM10_Tyre', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 8.80976727638107e-06, 'rmse': 0.002968125212382569, 'r2': 0.989351102994401, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 7.934530900636037e-06, 'best_rmse': 0.0028168299381815787, 'best_r2': 0.9904090539854391, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9875156548266943}), ('PM25_Brake', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 0.00010614136977098437, 'rmse': 0.010302493376410603, 'r2': 0.8691330049394411, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=120, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 0.00010611452254087318, 'best_rmse': 0.010301190345822817, 'best_r2': 0.869166106230087, 'best_params': {'max_depth': 6, 'n_estimators': 120}, 'best_score': 0.8385838354104402}), ('PM25_Exhaust', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 2.2290516379026105e-05, 'rmse': 0.004721283340261004, 'r2': 0.9839105883364085, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 2.0287439389856145e-05, 'best_rmse': 0.004504158011199889, 'best_r2': 0.9853564198157971, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9869454283964716}), ('PM25_Resusp', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 2.3633240852221214e-07, 'rmse': 0.000486140317729575, 'r2': 0.9909603087283639, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=120, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 2.227621662549712e-07, 'best_rmse': 0.0004719768704660973, 'best_r2': 0.9914793691540763, 'best_params': {'max_depth': 8, 'n_estimators': 120}, 'best_score': 0.988481709992198}), ('PM25_Tyre', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'mse': 4.43777905787662e-06, 'rmse': 0.0021066036784066953, 'r2': 0.9891411988547224, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...), 'best_mse': 3.982623367291088e-06, 'best_rmse': 0.001995651113619585, 'best_r2': 0.9902549192697659, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.986861243187229})])

Save results¶

In [ ]:
if not os.path.exists('pickles'):
    os.makedirs('pickles')

if not os.path.exists('metrics'):
    os.makedirs('metrics')
In [ ]:
# Save the model_dict to a pickle file
with open('pickles/model_dict.pkl', 'wb') as f:
    pickle.dump(model_dict, f)
In [ ]:
# consolidate the metrics for each model from model_dict to a csv file
df_metrics = pd.DataFrame()
columns_list = ['model_name', 'pollutant', 'mse', 'rmse', 'r2', 'best_mse', 'best_rmse', 'best_r2', 'best_params']
for k, v in model_dict.items():
    for k1, v1 in v.items():
        if 'best_model' in v1.keys():
            row = [k, k1, v1['mse'], v1['rmse'], v1['r2'],
                    v1['best_mse'], v1['best_rmse'], v1['best_r2'], v1['best_params']]
        else:
            row = [k, k1, v1['mse'], v1['rmse'], v1['r2'], np.nan, np.nan, np.nan, np.nan]
        df_metrics = pd.concat([df_metrics, pd.DataFrame([row], columns=columns_list)], axis=0)
df_metrics.to_csv('metrics/consolidated_metrics.csv')

Results Summary¶

Pre-processing: feature selection¶

In [ ]:
# if df_selected_features is not initialized, load from features folder
try:
    df_selected_features
except NameError:
    df_selected_features = pd.read_csv('features/selected_features.csv')

df_selected_features.head()

Model Evaluation¶

In [ ]:
# if model_dict is not initialized, load from models folder
try:
    model_dict
except NameError:
    with open('pickles/model_dict.pkl', 'rb') as f:
        model_dict = pickle.load(f)

# if df_metrics is not initialized, load from metrics folder
try:
    df_metrics
except NameError:
    df_metrics = pd.read_csv('metrics/consolidated_metrics.csv')

df_metrics.head()